Data Mining Final Project

Authors

RAVLEEN KAUR CHADHA, MRIDULA KALAISELVAN, YASH SHARMA

# Package setup ----
# Install pacman only when it is missing. requireNamespace() tests availability
# without the side effects of require(), and library() fails loudly if the
# installation did not succeed.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# p_load() installs any missing CRAN package and then attaches it. It runs
# before the GitHub install below so that devtools is guaranteed to exist.
pacman::p_load(
  tidyverse,
  scales,
  devtools,
  here,
  plotly
)

# dsbox is installed from GitHub; skip the download when it is already
# available instead of reinstalling on every run.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  devtools::install_github("tidyverse/dsbox")
}
# Load the data set; the path is resolved against the working directory
hepatitis <- read_csv("HepatitisCdata.csv")

# Median imputation: in every numeric column, each missing entry is replaced
# by that column's median (computed with the missing entries excluded)
hepatitis <- mutate(
  hepatitis,
  across(
    where(is.numeric),
    function(column) {
      replace(column, is.na(column), median(column, na.rm = TRUE))
    }
  )
)

# Count the remaining NA values per column to confirm the imputation
hepatitis %>%
  is.na() %>%
  colSums()
    ...1 Category      Age      Sex      ALB      ALP      ALT      AST 
       0        0        0        0        0        0        0        0 
     BIL      CHE     CHOL     CREA      GGT     PROT 
       0        0        0        0        0        0 
# Create an interactive scatterplot of AST against ALT, coloured by Category.
#
# Category is mapped through plot_ly()'s top-level `color` argument, which
# builds one trace (and one legend entry) per level. Putting a label column in
# `marker$color` does not work: plotly.js reads strings there as literal CSS
# colours, and `colorscale` only applies to numeric colour values.
# NOTE(review): assumes Category is a categorical label column - confirm.
p <- plot_ly(
  data = hepatitis,
  x = ~AST,
  y = ~ALT,
  type = "scatter",
  mode = "markers",
  color = ~as.factor(Category),
  # Viridis anchor colours; plotly interpolates them to the number of levels
  colors = c("#440154", "#3B528B", "#21908C", "#5DC863", "#FDE725"),
  marker = list(size = 10),
  # Hover tooltip showing the category and both enzyme values
  text = ~paste("Category:", Category, "<br>AST:", AST, "<br>ALT:", ALT)
)
p <- p %>% layout(
  title = "Interactive Scatterplot: AST vs. ALT",
  xaxis = list(title = "AST"),
  yaxis = list(title = "ALT")
)
p
# Create a correlation matrix heatmap of the numeric variables.
#
# The readr-generated column `...1` (the name readr assigns to an unnamed
# first column) is left out when present, because correlating a row
# identifier with lab values is not meaningful.
numeric_vars <- hepatitis %>%
  select(where(is.numeric), -any_of("...1"))
cor_matrix <- cor(numeric_vars)

# Pass the variable names as x/y so the axes are labelled with column names
# instead of positional indices, and pin the colour range to [-1, 1], the
# full range a correlation coefficient can take.
heatmap_plot <- plot_ly(
  x = colnames(cor_matrix),
  y = rownames(cor_matrix),
  z = cor_matrix,
  type = "heatmap",
  colorscale = "Viridis",
  zmin = -1,
  zmax = 1
) %>%
  layout(title = "Interactive Heatmap of Correlations")
heatmap_plot
# Create enhanced interactive scatterplot: AST vs ALT, coloured by Category,
# with marker size scaled by Age and one highlighted observation.

# Row index of the observation with the highest AST. The annotation is
# anchored to this real data point; pairing max(AST) with max(ALT) can point
# at a coordinate where no observation exists.
ast_peak <- which.max(hepatitis$AST)

p <- plot_ly(
  data = hepatitis,
  x = ~AST, y = ~ALT, type = "scatter", mode = "markers",
  # Top-level colour mapping gives one trace and legend entry per category.
  # Strings inside marker$color are read as literal CSS colours by plotly.js,
  # and a colour bar (showscale) is only meaningful for numeric colour values.
  # NOTE(review): assumes Category is a categorical label column - confirm.
  color = ~as.factor(Category),
  # Viridis anchor colours; plotly interpolates them to the number of levels
  colors = c("#440154", "#3B528B", "#21908C", "#5DC863", "#FDE725"),
  marker = list(
    size = ~Age / 2 # Scale marker size by Age
  ),
  text = ~paste(
    "Category:", Category,
    "<br>AST:", AST,
    "<br>ALT:", ALT,
    "<br>Age:", Age
  ) # Dynamic tooltips
) %>%
  layout(
    title = list(
      text = "Interactive Scatterplot: AST vs ALT by Category",
      font = list(size = 20)
    ),
    xaxis = list(
      title = "AST (Aspartate Transaminase)",
      titlefont = list(size = 15)
    ),
    yaxis = list(
      title = "ALT (Alanine Transaminase)",
      titlefont = list(size = 15)
    ),
    legend = list(title = list(text = "Category"), orientation = "h"),
    annotations = list(
      list(
        x = hepatitis$AST[ast_peak],
        y = hepatitis$ALT[ast_peak],
        text = "Potential Outlier",
        showarrow = TRUE,
        arrowhead = 4,
        # Offset of the label from the annotated point
        ax = -50,
        ay = -50
      )
    )
  )

# Display the interactive scatterplot
p
# Five-colour palette (hex codes with alpha channel) used for the categories
color_palette <- c(
  "#440154FF",
  "#3B528BFF",
  "#21908CFF",
  "#5DC863FF",
  "#FDE725FF"
)

# Step 1: build the scatter trace(s) - AST on x, ALT on y, one colour per
# Category level, semi-transparent fixed-size markers, hover text per point
p <- plot_ly(
  data = hepatitis,
  x = ~AST,
  y = ~ALT,
  type = "scatter",
  mode = "markers",
  color = ~as.factor(Category),
  colors = color_palette,
  marker = list(size = 8, opacity = 0.8),
  text = ~paste(
    "Category:", Category,
    "<br>AST:", AST,
    "<br>ALT:", ALT,
    "<br>Age:", Age
  )
)

# Step 2: apply the minimal theme - white plot and paper backgrounds, no grid
# or zero lines, and a horizontal legend centred under the plot
p <- layout(
  p,
  title = list(
    text = "Interactive Scatterplot: AST vs ALT by Category",
    font = list(size = 18)
  ),
  xaxis = list(
    title = "AST (Aspartate Transaminase)",
    titlefont = list(size = 14),
    zeroline = FALSE,
    showgrid = FALSE
  ),
  yaxis = list(
    title = "ALT (Alanine Transaminase)",
    titlefont = list(size = 14),
    zeroline = FALSE,
    showgrid = FALSE
  ),
  legend = list(
    title = list(text = "Category"),
    font = list(size = 12),
    orientation = "h",
    x = 0.5,
    xanchor = "center"
  ),
  plot_bgcolor = "#FFFFFF",
  paper_bgcolor = "#FFFFFF"
)

# Render the finished figure
p